knitr::opts_chunk$set(echo = TRUE,
eval = FALSE,# I already ran and saved the data in data folder
warning = FALSE,
message = FALSE,
error = FALSE)
# loading packages
library(rvest) # for web scraping functions using css tags
library(tidyverse) # for basic data wrangling
library(polite) # for checking website permissions
library(janitor) # for clean column names
library(RSelenium) # for scraping dynamic tables
library(netstat) # to find free port
The main results page on Butte to Butte website looks as this:
knitr::include_graphics(here::here("images/b2b_results_page.png"))
There are hyperlinks embedded on the page for each race year which redirects to the annual race pages. First, I will extract all the year-event page links.
# read the main results page on B2B
# save url in string
b2b_results_url <- "https://buttetobutte.com/results"
# see rules for scraping on the page and other bots information
results_session <- bow(b2b_results_url,
user_agent = "HK result scraping",
force = TRUE)
results_session # we have the permission to scrape!
# get annual links in a tibble
b2b_annual_race_links <- tibble(
year = scrape(results_session) %>%
# all year results in p - get those
html_element("p") %>%
# extracting hyperlinks from paragraph with each year has corresponding results
html_elements("a") %>%
html_text(),
link = scrape(results_session) %>%
# all year results in p - get those
html_element("p") %>%
# extracting hyperlinks from paragraph with each year has corresponding results
html_elements("a") %>%
html_attr('href')
)
# see data frame
head(b2b_annual_race_links)
We extracted links for 23 years. Though the race is ongoing for 50 years, since 1974, digital results are only available for 23 years, starting 2000 (no results for 2001).
# see links type
table(gsub("(.*com)(.*)", "\\1",b2b_annual_race_links$link))
The links are present in three formats - starting with nebula, onlineraceresults, and eclectivedgeracing
Nebula corresponds to pages with error - 4 years
For the other years, one of the two different webpage designs were used. I will write functions to extract results from each of those.
b2b_annual_race_links <- b2b_annual_race_links %>%
mutate(valid_link = ifelse(grepl("nebula", link), 0, 1),
html_design = ifelse(valid_link == 1, gsub("(.*com)(.*)", "\\1",link), NA),
html_design = factor(html_design, labels = c("orr", "eer")),
year = parse_number(year))
table(b2b_annual_race_links$valid_link) # Valid link available for 19 years
table(b2b_annual_race_links$html_design) # types of html design used
Elective Edge Racing (EER) style for 6 years (including 2024) and Online Race Results (ORR) for 13 years. In the 13 years that ORR was used, there were three different styles to organize the webpage. For 2015-2017, the results are available in text format. For prior years, the results aren’t in a text format but instead a scrollable table that uses Javascript. I’ll divide the workflow in two parts - scraping static webpages and scraping dynamic pages (I can scrape all ORR results with dynamic scraping method but I already wrote the code to scrape the text formatted table, so I’m playing along).
b2b_annual_race_links_static <- b2b_annual_race_links %>%
filter(valid_link == 1, year > 2014)
b2b_annual_race_links_dynamic <- b2b_annual_race_links %>%
filter(valid_link == 1, year < 2015)
Process:
knitr::include_graphics(here::here("images/eer_table_extract.gif"))
extract_table_eer <- function(link, year){
# check permissions
year_page_session <- bow(link,
user_agent = "HK result scraping",
force = TRUE)
# extract hyperlinks specific to results page - second on the each year's tab
year_results_link <- scrape(year_page_session) %>%
html_element(".nav-tabs") %>%
html_elements("li") %>%
html_elements("a") %>%
#html_text()
html_attr("href") %>%
.[2]
# extract overall-results links for each format type
year_results_session <- year_page_session %>%
nod(path = year_results_link)
race_codes <- tibble(
race_format = scrape(year_results_session) %>%
html_element(".tabs") %>%
html_elements("p") %>%
html_text(),
race_link_code = scrape(year_results_session) %>%
html_element(".tabs") %>%
html_elements("p") %>%
html_attr("race-code"),
race_links = paste0(year_results_link,"/",race_link_code))
results_data <- map2_df(race_codes$race_format, race_codes$race_links,
~year_results_session %>%
nod(path = .y) %>%
scrape(.) %>%
html_element("table") %>%
html_table() %>%
clean_names() %>%
# removing PII information
select(-c(x, name, bib, age)) %>%
mutate(
race_format = .x,
race_year = year
))
return(results_data)
}
# scrape eer style pages
eer_year_results <- b2b_annual_race_links_static %>%
filter(html_design == "eer")
eer_year_results <- map2_df(eer_year_results$link, eer_year_results$year,
~extract_table_eer(.x, .y))
Process:
knitr::include_graphics(here::here("images/orr_text_table_extract.gif"))
extract_table_orr <- function(link, year){
# check permissions
year_page_session <- bow(link,
user_agent = "HK result scraping",
force = TRUE)
# extract hyperlinks specific to results page - ending with Overall - Text Results
year_event_results_links <- tibble(
description = scrape(year_page_session) %>%
html_element("body") %>%
html_element("main") %>%
html_elements("a") %>%
html_text(),
links = scrape(year_page_session) %>%
html_element("body") %>%
html_element("main") %>%
html_elements("a") %>%
html_attr('href')) %>%
filter(grepl("Text", description)) %>%
# links have relative path - adding the first part for full address
mutate(links = paste0("https://onlineraceresults.com", links))
# extract text files from each event-format page
format_results_text_files <- map(year_event_results_links$links,
~ year_page_session %>%
nod(path = .x) %>%
scrape(.) %>%
html_element("pre") %>%
html_text() %>%
gsub("(.*)(Place )(.*)", "\\2\\3", .) %>%
gsub("\r", "", .) %>%
gsub("/", "_", .) %>%
str_split(., "\n") %>%
unlist()
)
# convert text file to dataframe
# initialize event_data inputs
results_data <- vector(mode = "list", length = length(format_results_text_files))
for(i in seq_along(format_results_text_files)){
# count columns width in the dataframe using "=" signs
column_size <- str_split(format_results_text_files[[i]][2], "= ") %>%
unlist()
column_size <- map_dbl(column_size[1:(length(column_size)-1)],
~str_count(.x,"=")+2)
column_size <- cumsum(column_size)
column_size <- c(1, column_size)
# store column names
column_names <- vector(mode = "character", length = length(column_size) - 1)
for (j in 2:length(column_size)){
column_names[j - 1] <- substr(format_results_text_files[[i]][1],
column_size[j-1], column_size[j]) %>%
gsub(" ","",.)
}
# make a dataframe to store each row's values
results_data[[i]] <- tibble(
column_names,
dummy_val = 1
) %>%
pivot_wider(
names_from = column_names,
values_from = dummy_val
)
# initialize row inputs
row_value <- vector(mode = "character", length = length(column_size) - 1)
for(j in 3:length(format_results_text_files[[i]])){
row_value <- map_chr(2:length(column_size), ~substr(format_results_text_files[[i]][j], column_size[.x-1], column_size[.x]) %>%
gsub(" ","",.))
results_data[[i]] <- rbind(results_data[[i]], row_value)}
}
# add event detail and year to each data file
# then reduce to a single data frame
results_data <- map2(results_data,
year_event_results_links$description, ~.x %>%
mutate(race_format = gsub("(.*)( Overall - Text Format)", "\\1", .y),
race_year = year) %>%
clean_names() %>%
filter(place!= "", name != "1") %>%
select(-c(name, ag, no))) %>%
reduce(., bind_rows)
return(results_data)
}
# scrape orr style pages
orr_year_results <- b2b_annual_race_links_static %>%
filter(html_design == "orr")
orr_year_results <- map2_df(orr_year_results$link, orr_year_results$year,
~extract_table_orr(.x, .y))
Process:
knitr::include_graphics(here::here("images/dynamic_table_extract.gif"))
# get result-links to loop through
# within each link, the steps are to click see all results, see all rows, and scrape the webpage
extract_hyperlinks <- function(link, year){
# check permissions
year_page_session <- bow(link,
user_agent = "HK result scraping",
force = TRUE)
# extract hyperlinks specific to results page - ending with Overall - Text Results
year_event_results_links <- tibble(
description = scrape(year_page_session) %>%
html_elements(xpath = '//*[(@id = "orr-event-races")]') %>%
html_elements("a") %>%
html_text(),
links = scrape(year_page_session) %>%
html_elements(xpath = '//*[(@id = "orr-event-races")]') %>%
html_elements("a") %>%
html_attr('href'),
nrow = length(description),
further_filter = ifelse(nrow > 2, 1, 0),
link_to_keep = ifelse(further_filter == 1 & grepl("Overall", description), 1, 0),
year = year
) %>%
filter(further_filter == 0 | (further_filter == 1 & link_to_keep == 1)) %>%
# links have relative path - adding the first part for full address
mutate(links = paste0("https://onlineraceresults.com", links))
return(year_event_results_links)
}
dynamic_hyperlinks <- map2_df(b2b_annual_race_links_dynamic$link,
b2b_annual_race_links_dynamic$year,
~extract_hyperlinks(.x, .y))
# set-up driver_object and start browser
rs_driver_object <- rsDriver(
browser = "chrome",
chromever = "126.0.6478.126",
verbose = TRUE,
port = netstat::free_port()
)
# set up the client side
rs_driver_client <- rs_driver_object$client
Now, we will navigate to each website from out hyperlink page. Then, we will click show all results, show all rows, and scrape the page. We will
extract_table_orr_dynamic <- function(link, year, format){
# navigate to the page
rs_driver_client$navigate(link)
# click show all results
show_all_results <- rs_driver_client$findElement(using = 'xpath', '//*[(@id = "orr-race-results-search")]//li[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//a')
show_all_results$clickElement()
# click show all rows
show_all_rows <- rs_driver_client$findElement(using = 'xpath', '//*[(@id = "orr-main-event-race-results")]//a[(((count(preceding-sibling::*) + 1) = 5) and parent::*)]')
show_all_rows$clickElement()
# get page source and scrape page
table_location <- rs_driver_client$findElement(using = 'id',
'race-results-table')
table_html <- table_location$getPageSource()
# extract table
table_df <- read_html(table_html %>% unlist()) %>%
# it's the third table
html_table()
# it's either the third or the first. Earleir years have pages that only have one table
results_table_index <- ifelse(length(table_df) > 1, 3, 1)
table_df <- table_df %>%
.[[results_table_index]] %>%
clean_names() %>%
# drop PII and irrelevant cols
select(-c(x, x_2, fn, ln, no)) %>%
mutate(race_format = format,
race_year = year)
return(table_df)
}
# loop through hyperlinks and get the tables
dynamic_table_results <- pmap(list(dynamic_hyperlinks$links,
dynamic_hyperlinks$year,
dynamic_hyperlinks$description),
~extract_table_orr_dynamic(..1, ..2, ..3))
So satisfying when it magically works!
knitr::include_graphics(here::here("images/dynamic_scraping.gif"))
# close dynamic browser
rs_driver_client$closeall()
#collapse list elements into one dataframe
dynamic_year_results <- reduce(dynamic_table_results, bind_rows)
# harmonizing column names
str(eer_year_results)
str(orr_year_results)
str(dynamic_year_results)
Just realized there is no variable ‘state’ in the tables I scraped dynamically - which was kind of the whole point. For now, I’m harmonizing the files and might come back to the project to fix the state information.
# changing colnames and col style to err_year_results style
orr_year_results <- orr_year_results %>%
rename("sex" = "s",
"state" = "st",
"gun_time" = "time") %>% # assuming it will be the guntime since rfid wasn't too common a couple years ago
mutate(div_place = gsub("(\\d+)(_)(\\d+)", "\\1", div_tot),
div_total = gsub("(\\d+)(_)(\\d+)", "\\3", div_tot),
div = gsub("([A-Z]\\d{2})(\\d{2})", "\\1-\\2", div),
across(c(place, div_place, div_total), as.integer)
) %>%
select(-div_tot)
dynamic_year_results <- dynamic_year_results %>%
rename("place" = "overall",
"div"= "division",
"gun_time" = "time") %>% #same assumption
mutate(div_place = gsub("(\\d+)(/)(\\d+)", "\\1", divpl),
div_total = gsub("(\\d+)(/)(\\d+)", "\\3", divpl),
sex_place = gsub("(\\d+)(/)(\\d+)", "\\1", sexpl),
sex_total = gsub("(\\d+)(/)(\\d+)", "\\3", sexpl),
div = gsub("([A-Z]\\d{2})(\\d{2})", "\\1-\\2", div),
sex = gsub("([A-Z])(.*)", "\\1", div),
race_format = gsub(" Overall| Run| Mayor's", "", race_format),
across(c(place, div_place, div_total, sex_place, sex_total), as.integer)) %>%
select(-c(divpl, sexpl))
#changing race_format from 4 Mile Walk to 4M Walk in eer_year_results to be consistent
eer_year_results <- eer_year_results %>%
mutate(race_format = gsub(" Mile","M", race_format))
#join
all_year_results <- bind_rows(eer_year_results, orr_year_results, dynamic_year_results)
write.csv(all_year_results, "data/butte_to_butte_results_2000_2024.csv")